import numpy as np
import pandas as pd
import copy
import geopandas as gpd
import plotly.graph_objs as go
import plotly as py
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt
states = pd.read_excel("./messy_data/states.xlsx")
states.head()
states.drop(states.index[states.Abbreviation == "DC"], inplace=True)
states.columns = ["Jurisdiction", "Jurisdiction Abbreviation"]
states = states.append({"Jurisdiction" : "Federal", "Jurisdiction Abbreviation" : "FED"}, ignore_index=True)
states.tail()
malePrisonerPopulation = pd.read_excel("./messy_data/Prisoners under the jurisdiction of state or federal correctional authorities 1978-2016.xlsx", sheet_name="Male", header=9, nrows=54).dropna(1, "all").dropna(0)
malePrisonerPopulation.head()
femalePrisonerPopulation = pd.read_excel("./messy_data/Prisoners under the jurisdiction of state or federal correctional authorities 1978-2016.xlsx", sheet_name="Female", header=9, nrows=54).dropna(1, "all").dropna(0)
femalePrisonerPopulation.head()
totalPrisonerPopulation = pd.read_excel("./messy_data/Prisoners under the jurisdiction of state or federal correctional authorities 1978-2016.xlsx", sheet_name="Total", header=9, nrows=54).dropna(1, "all").dropna(0)
totalPrisonerPopulation.head()
def cleanAndMeltPopulationTable(table, states):
# add DC data to Federal, and delete it
table.loc[table.Jurisdiction == "Federal"].iloc[:, 1:] = table.loc[table.Jurisdiction == "Federal"].iloc[:, 1:] + table.loc[table.Jurisdiction == "District of Columbia"].iloc[:, 1:].replace("--", 0)
table.drop(table.index[table.Jurisdiction == "District of Columbia"], inplace=True)
table = table.melt(id_vars = "Jurisdiction", var_name = "Year", value_name = "Population")
table.Year = table.Year.astype(int)
table = states.merge(table, on="Jurisdiction")
return table
meltedMalePrisonerPopulation = cleanAndMeltPopulationTable(malePrisonerPopulation, states)
meltedFemalePrisonerPopulation = cleanAndMeltPopulationTable(femalePrisonerPopulation, states)
meltedTotalPrisonerPopulation = cleanAndMeltPopulationTable(totalPrisonerPopulation, states)
meltedTotalPrisonerPopulation.head()
regions = pd.read_excel("./messy_data/state_region.xlsx")
regions.head()
regions.drop(regions.index[regions.State == "District of Columbia"], inplace=True)
def aggregateRegionalPopulationSum(table, regions):
table = regions.merge(table, left_on="State", right_on="Jurisdiction", how="right")
temp_index = table["Jurisdiction"] == "Federal"
table.loc[temp_index, "Region"] = "Federal"
table.loc[temp_index, "Division"] = "Federal"
regionSum = table.groupby(["Year", "Region"]).Population.sum()
divisionSum = table.groupby(["Year", "Region", "Division"]).Population.sum()
return regionSum, divisionSum
malePrisonerPopulationRegionSum, malePrisonerPopulationDivisionSum = aggregateRegionalPopulationSum(meltedMalePrisonerPopulation, regions)
femalePrisonerPopulationRegionSum, femalePrisonerPopulationDivisionSum = aggregateRegionalPopulationSum(meltedFemalePrisonerPopulation, regions)
totalPrisonerPopulationRegionSum, totalPrisonerPopulationDivisionSum = aggregateRegionalPopulationSum(meltedTotalPrisonerPopulation, regions)
def plotAnnualStatePopulation(table, gender):
table = table[table.Jurisdiction != "Federal"]
cmax = table.Population.max()
cmin = table.Population.min()
years = table.Year.unique()
heatMapData = [{"type" : "choropleth", "locations" : table.loc[table.Year == year, "Jurisdiction Abbreviation"], "locationmode" : "USA-states", "colorscale" : "Viridis", "zmin" : cmin, "zmax" : cmax, "z" : table.Population[table.Year == year].astype(float)} for year in years]
mapLayout = [{"geo" : {"scope" : 'usa'}, "title" : gender + " Prisoner Population by State, " + str(year)} for year in years]
for i, data in enumerate(heatMapData):
usHeatMap = go.Figure(data=[data], layout=mapLayout[i])
iplot(usHeatMap)
# gender = "Total"
# cmax = table.Population.max()
# cmin = table.Population.min()
# heatMapData = [dict(type = "choropleth", locations=table.loc[table.Year == year, "Jurisdiction Abbreviation"], locationmode="USA-states", colorscale="Viridis", zmin=cmin, zmax=cmax, z=table.Population[table.Year == year].astype(float), text = str(year), geo = 'geo'+str(i+1) if i != 0 else 'geo') for i, year in enumerate(years)]
# mapLayout = {"geo" + (str(i + 1) if i != 0 else "" ) : {"scope" : 'usa', "domain" : dict(x=[0, 1], y=[i / len(years), (i + 1) / len(years)])} for i in range(len(years))}
# fig = {'data' : heatMapData, 'layout' : mapLayout}
# iplot(fig)
init_notebook_mode()
plotAnnualStatePopulation(meltedMalePrisonerPopulation, "Male")
plotAnnualStatePopulation(meltedFemalePrisonerPopulation, "Female")
plotAnnualStatePopulation(meltedTotalPrisonerPopulation, "Total")
usMap = gpd.read_file("./us_states_map.json")
usMap
usMap = usMap.loc[list(map(lambda x: x not in ["Puerto Rico", "District of Columbia"], usMap.NAME))]
usMap
def plotGeoPandasUSMap(usMap, title=None, dataColumn=None, dataLimit=(None, None)):
fig, mapAx = plt.subplots(1, 1)
mapLimitW, mapLimitE, mapLimitS, mapLimitN = -185, -65, 15, 75
mapAx.axis((mapLimitW, mapLimitE, mapLimitS, mapLimitN))
mapAx.axis('off')
mapAx.set_aspect('equal', 'box')
mapAx.set_title(title)
usMap.plot(column=dataColumn, figsize=(20, 20), edgecolor="k", ax=mapAx, legend=True, vmin=dataLimit[0], vmax=dataLimit[1])
# sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=plt.Normalize(vmin=dataLimit[0], vmax=dataLimit[1]))
# sm._A = []
# cbar = mapAx.colorbar(sm)
usMap = usMap.merge(states, left_on="NAME", right_on = "Jurisdiction", how="left").drop("Jurisdiction", axis=1)
usMap.columns = ["GEO_ID", "STATE", "NAME", "LSAD", "CENSUSAREA", "geometry", "Abbreviatio"]
usMap.head()
usMap = usMap.merge(regions, left_on="NAME", right_on = "State", how="left").drop("State", axis=1)
usMap.head()
def plotRegionalPopulation(table, usMap, gender):
regionType = table.index.names[1]
tableWOFederal = table[table.index.get_level_values(1) != "Federal"]
years = table.index.get_level_values(0).unique()
dividedTableWOFederal = [tableWOFederal[year] for year in years]
usRegionMap = usMap[[regionType, "geometry"]].dissolve(regionType)
colorlimit = (tableWOFederal.min(), tableWOFederal.max())
for i, oneYearTable in enumerate(dividedTableWOFederal):
year = years[i]
oneYearTable = usRegionMap.join(oneYearTable)
figtitle = gender + " Prisoner Population by " + regionType + ", " + str(year)
plotGeoPandasUSMap(oneYearTable, figtitle, "Population", colorlimit)
figtitle = gender + " Prisoner Population by " + regionType
ax = table.unstack().plot(title = figtitle)
ax.set_ylim(bottom=0)
ax.tick_params(axis = "y", length=0)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=len(table.index.get_level_values(1).unique()))
plotRegionalPopulation(malePrisonerPopulationRegionSum, usMap, "Male")
plotRegionalPopulation(femalePrisonerPopulationRegionSum, usMap, "Female")
plotRegionalPopulation(totalPrisonerPopulationRegionSum, usMap, "Total")
def dropRegionIndex(table):
table = copy.deepcopy(table)
table.index = table.index.droplevel(1)
return table
plotRegionalPopulation(dropRegionIndex(malePrisonerPopulationDivisionSum), usMap, "Male")
plotRegionalPopulation(dropRegionIndex(femalePrisonerPopulationDivisionSum), usMap, "Female")
plotRegionalPopulation(dropRegionIndex(totalPrisonerPopulationDivisionSum), usMap, "Total")
custodyPopulation_11_16 = pd.read_excel("./messy_data/Prison occupation rate.xlsx", "Custody Population", na_values="/")
occupancyLowRate_11_16 = pd.read_excel("./messy_data/Prison occupation rate.xlsx", "Highest Capacity", na_values="/")
occupancyHighRate_11_16 = pd.read_excel("./messy_data/Prison occupation rate.xlsx", "Lowest Capacity", na_values="/")
custodyPopulation_11_16
def cleanAndMeltCustodyOccupancyTable(table, states, tableType):
table = table.melt(id_vars = "Jurisdiction", var_name = "Year", value_name = tableType)
if tableType == "Occupancy":
table.Occupancy = table.Occupancy / 100
table.Year = table.Year.astype(int)
table = states.merge(table, on="Jurisdiction")
return table
meltedCustodyPopulation_11_16 = cleanAndMeltCustodyOccupancyTable(custodyPopulation_11_16, states, "Population")
meltedOccupancyLowRate_11_16 = cleanAndMeltCustodyOccupancyTable(occupancyLowRate_11_16, states, "Occupancy")
meltedOccupancyHighRate_11_16 = cleanAndMeltCustodyOccupancyTable(occupancyHighRate_11_16, states, "Occupancy")
meltedCustodyPopulation_11_16.head()
meltedOccupancyHighRate_11_16.head()
def plotAnnualStateOccupancy(table, low_or_high):
table = table[table.Occupancy != "Federal"]
cmax = table.Occupancy.max()
cmin = table.Occupancy.min()
years = table.Year.unique()
heatMapData = [{"type" : "choropleth", "locations" : table.loc[table.Year == year, "Jurisdiction Abbreviation"], "locationmode" : "USA-states", "colorscale" : "Viridis", "zmin" : cmin, "zmax" : cmax, "z" : table.Occupancy[table.Year == year].astype(float)} for year in years]
mapLayout = [{"geo" : {"scope" : 'usa'}, "title" : "Prison Occupancy (" + low_or_high + " Estimation) by State, " + str(year)} for year in years]
for i, data in enumerate(heatMapData):
usHeatMap = go.Figure(data=[data], layout=mapLayout[i])
iplot(usHeatMap)
plotAnnualStateOccupancy(meltedOccupancyLowRate_11_16, "Low")
plotAnnualStateOccupancy(meltedOccupancyHighRate_11_16, "High")
def aggregateRegionalOccupancy(table, populationTable, regions):
table = regions.merge(table.dropna(), left_on="State", right_on="Jurisdiction", how="right")
table = table.merge(populationTable[["Jurisdiction", "Year", "Population"]].dropna(), on=["Jurisdiction", "Year"], how="left")
temp_index = table["Jurisdiction"] == "Federal"
table.loc[temp_index, "Region"] = "Federal"
table.loc[temp_index, "Division"] = "Federal"
table["Capacity"] = table.Population / table.Occupancy
regionCapacity = table.groupby(["Year", "Region"]).Capacity.sum()
divisionCapacity = table.groupby(["Year", "Region", "Division"]).Capacity.sum()
regionPopulation = table.groupby(["Year", "Region"]).Population.sum()
divisionPopulation = table.groupby(["Year", "Region", "Division"]).Population.sum()
regionOccupancy = (regionPopulation / regionCapacity).rename("Occupancy")
divisionOccupancy = (divisionPopulation / divisionCapacity).rename("Occupancy")
return regionOccupancy, divisionOccupancy
regionLowOccupancy, divisionLowOccupancy = aggregateRegionalOccupancy(meltedOccupancyLowRate_11_16, meltedCustodyPopulation_11_16, regions)
regionHighOccupancy, divisionHighOccupancy = aggregateRegionalOccupancy(meltedOccupancyHighRate_11_16, meltedCustodyPopulation_11_16, regions)
regionLowOccupancy
divisionLowOccupancy
def plotRegionalOccupancy(table, usMap, low_or_high):
regionType = table.index.names[1]
tableWOFederal = table[table.index.get_level_values(1) != "Federal"]
years = table.index.get_level_values(0).unique()
dividedTableWOFederal = [tableWOFederal[year] for year in years]
usRegionMap = usMap[[regionType, "geometry"]].dissolve(regionType)
colorlimit = (tableWOFederal.min(), tableWOFederal.max())
for i, oneYearTable in enumerate(dividedTableWOFederal):
year = years[i]
oneYearTable = usRegionMap.join(oneYearTable)
figtitle = "Prison Occupancy (" + low_or_high + " Estimate) by " + regionType + ", " + str(year)
plotGeoPandasUSMap(oneYearTable, figtitle, "Occupancy", colorlimit)
figtitle = "Prison Occupancy (" + low_or_high + " Estimate) by " + regionType
ax = table.unstack().plot(title = figtitle)
ax.tick_params(axis = "y", length=0)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=len(table.index.get_level_values(1).unique()))
plotRegionalOccupancy(regionLowOccupancy, usMap, "Low")
plotRegionalOccupancy(regionHighOccupancy, usMap, "High")
plotRegionalOccupancy(dropRegionIndex(divisionLowOccupancy), usMap, "Low")
plotRegionalOccupancy(dropRegionIndex(divisionHighOccupancy), usMap, "High")